"""A builtin set of models available in MLC LLM."""

from typing import Any, Dict  # pylint: disable=too-many-lines

# pylint: disable=too-many-lines

MODEL_PRESETS: Dict[str, Any] = {
    "llama2_7b": {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "pad_token_id": 0,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.31.0.dev0",
        "use_cache": True,
        "vocab_size": 32000,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "llama2_13b": {
        "_name_or_path": "meta-llama/Llama-2-13b-hf",
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 5120,
        "initializer_range": 0.02,
        "intermediate_size": 13824,
        "max_position_embeddings": 2048,
        "model_type": "llama",
        "num_attention_heads": 40,
        "num_hidden_layers": 40,
        "num_key_value_heads": 40,
        "pad_token_id": 0,
        "pretraining_tp": 2,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.31.0.dev0",
        "use_cache": True,
        "vocab_size": 32000,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "llama2_70b": {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 8192,
        "initializer_range": 0.02,
        "intermediate_size": 28672,
        "max_position_embeddings": 2048,
        "model_type": "llama",
        "num_attention_heads": 64,
        "num_hidden_layers": 80,
        "num_key_value_heads": 8,
        "pad_token_id": 0,
        "rms_norm_eps": 1e-05,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.31.0.dev0",
        "use_cache": True,
        "vocab_size": 32000,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "codellama_7b": {
        "_name_or_path": "codellama/CodeLlama-7b-hf",
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 16384,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 1000000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.33.0.dev0",
        "use_cache": True,
        "vocab_size": 32016,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "codellama_13b": {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 5120,
        "initializer_range": 0.02,
        "intermediate_size": 13824,
        "max_position_embeddings": 16384,
        "model_type": "llama",
        "num_attention_heads": 40,
        "num_hidden_layers": 40,
        "num_key_value_heads": 40,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 1000000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.32.0.dev0",
        "use_cache": True,
        "vocab_size": 32016,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "codellama_34b": {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 8192,
        "initializer_range": 0.02,
        "intermediate_size": 22016,
        "max_position_embeddings": 16384,
        "model_type": "llama",
        "num_attention_heads": 64,
        "num_hidden_layers": 48,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 1000000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.32.0.dev0",
        "use_cache": True,
        "vocab_size": 32016,
        "context_window_size": 2048,
        "prefill_chunk_size": 2048,
    },
    "tinyllama_1b_chat_v0.4": {
        "_name_or_path": "/data/tianduo/tinyllama-ft/checkpoint-3890",
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 5632,
        "max_position_embeddings": 2048,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 22,
        "num_key_value_heads": 4,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "float32",
        "transformers_version": "4.33.1",
        "use_cache": False,
        "vocab_size": 32003,
    },
    "tinyllama_1b_chat_v1.0": {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 5632,
        "max_position_embeddings": 2048,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 22,
        "num_key_value_heads": 4,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.35.0",
        "use_cache": True,
        "vocab_size": 32000,
    },
    "mistral_7b": {
        "architectures": ["MistralForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 32768,
        "model_type": "mistral",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "rms_norm_eps": 1e-05,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.34.0.dev0",
        "use_cache": True,
        "vocab_size": 32000,
        "sliding_window_size": 4096,
        "prefill_chunk_size": 128,
        "attention_sink_size": 4,
    },
    "mistral_7b_v03": {
        "architectures": ["MistralForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 32768,
        "model_type": "mistral",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "rms_norm_eps": 1e-05,
        "rope_theta": 1000000.0,
        "sliding_window": None,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.0.dev0",
        "use_cache": True,
        "vocab_size": 32768,
    },
    "gpt2": {
        "activation_function": "gelu_new",
        "architectures": ["GPT2LMHeadModel"],
        "attn_pdrop": 0.1,
        "bos_token_id": 50256,
        "embd_pdrop": 0.1,
        "eos_token_id": 50256,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "gpt2",
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "resid_pdrop": 0.1,
        "summary_activation": None,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
        "task_specific_params": {"text-generation": {"do_sample": True, "max_length": 50}},
        "vocab_size": 50257,
    },
    "gpt2_medium": {
        "activation_function": "gelu_new",
        "architectures": ["GPT2LMHeadModel"],
        "attn_pdrop": 0.1,
        "bos_token_id": 50256,
        "embd_pdrop": 0.1,
        "eos_token_id": 50256,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "gpt2",
        "n_ctx": 1024,
        "n_embd": 1024,
        "n_head": 16,
        "n_layer": 24,
        "n_positions": 1024,
        "n_special": 0,
        "predict_special_tokens": True,
        "resid_pdrop": 0.1,
        "summary_activation": None,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
        "task_specific_params": {"text-generation": {"do_sample": True, "max_length": 50}},
        "vocab_size": 50257,
    },
    "gpt_bigcode": {
        "activation_function": "gelu_pytorch_tanh",
        "architectures": ["GPTBigCodeForCausalLM"],
        "attention_softmax_in_fp32": True,
        "multi_query": True,
        "attn_pdrop": 0.1,
        "bos_token_id": 49152,
        "embd_pdrop": 0.1,
        "eos_token_id": 49152,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "gpt_bigcode",
        "n_embd": 2048,
        "n_head": 16,
        "n_inner": 8192,
        "n_layer": 24,
        "n_positions": 2048,
        "resid_pdrop": 0.1,
        "runner_max_sequence_length": None,
        "scale_attention_softmax_in_fp32": True,
        "scale_attn_weights": True,
        "summary_activation": None,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
        "transformers_version": "4.28.0.dev0",
        "use_cache": True,
        "vocab_size": 49280,
    },
    "Mixtral-8x7B-v0.1": {
        "architectures": ["MixtralForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 32768,
        "model_type": "mixtral",
        "num_attention_heads": 32,
        "num_experts_per_tok": 2,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "num_local_experts": 8,
        "output_router_logits": False,
        "rms_norm_eps": 1e-05,
        "rope_theta": 1000000.0,
        "router_aux_loss_coef": 0.02,
        "sliding_window": None,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.36.0.dev0",
        "use_cache": True,
        "vocab_size": 32000,
    },
    "redpajama_3b_v1": {
        "_name_or_path": "/root/fm/models/rp_3b_800b_real_fp16",
        "architectures": ["GPTNeoXForCausalLM"],
        "bos_token_id": 0,
        "eos_token_id": 0,
        "hidden_act": "gelu",
        "hidden_size": 2560,
        "initializer_range": 0.02,
        "intermediate_size": 10240,
        "layer_norm_eps": 1e-05,
        "max_position_embeddings": 2048,
        "model_type": "gpt_neox",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "rotary_emb_base": 10000,
        "rotary_pct": 1.0,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.28.1",
        "use_cache": True,
        "use_parallel_residual": False,
        "vocab_size": 50432,
    },
    "phi-1_5": {
        "_name_or_path": "microsoft/phi-1_5",
        "activation_function": "gelu_new",
        "architectures": ["PhiForCausalLM"],
        "attn_pdrop": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi.PhiConfig",
            "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM",
        },
        "embd_pdrop": 0.0,
        "flash_attn": False,
        "flash_rotary": False,
        "fused_dense": False,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "phi-msft",
        "n_embd": 2048,
        "n_head": 32,
        "n_head_kv": None,
        "n_inner": None,
        "n_layer": 24,
        "n_positions": 2048,
        "resid_pdrop": 0.0,
        "rotary_dim": 32,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.34.1",
        "vocab_size": 51200,
    },
    "phi-2": {
        "_name_or_path": "microsoft/phi-2",
        "activation_function": "gelu_new",
        "architectures": ["PhiForCausalLM"],
        "attn_pdrop": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi.PhiConfig",
            "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM",
        },
        "embd_pdrop": 0.0,
        "flash_attn": False,
        "flash_rotary": False,
        "fused_dense": False,
        "img_processor": None,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "phi-msft",
        "n_embd": 2560,
        "n_head": 32,
        "n_head_kv": None,
        "n_inner": None,
        "n_layer": 32,
        "n_positions": 2048,
        "resid_pdrop": 0.1,
        "rotary_dim": 32,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.35.2",
        "vocab_size": 51200,
    },
    # "phi-3": {
    #     "_name_or_path": "Phi-3-mini-4k-instruct",
    #     "architectures": ["Phi3ForCausalLM"],
    #     "attention_dropout": 0.0,
    #     "auto_map": {
    #         "AutoConfig": "configuration_phi3.Phi3Config",
    #         "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
    #     },
    #     "bos_token_id": 1,
    #     "embd_pdrop": 0.0,
    #     "eos_token_id": 32000,
    #     "hidden_act": "silu",
    #     "hidden_size": 3072,
    #     "initializer_range": 0.02,
    #     "intermediate_size": 8192,
    #     "max_position_embeddings": 4096,
    #     "model_type": "phi3",
    #     "num_attention_heads": 32,
    #     "num_hidden_layers": 32,
    #     "num_key_value_heads": 32,
    #     "original_max_position_embeddings": 4096,
    #     "pad_token_id": 32000,
    #     "resid_pdrop": 0.0,
    #     "rms_norm_eps": 1e-05,
    #     "rope_scaling": None,
    #     "rope_theta": 10000.0,
    #     "sliding_window": 2047,
    #     "tie_word_embeddings": False,
    #     "torch_dtype": "bfloat16",
    #     "transformers_version": "4.39.3",
    #     "use_cache": True,
    #     "vocab_size": 32064,
    # },
    "phi-3_5": {
        "_name_or_path": "Phi-3.5-mini-instruct",
        "architectures": ["Phi3ForCausalLM"],
        "attention_dropout": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi3.Phi3Config",
            "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
        },
        "bos_token_id": 1,
        "embd_pdrop": 0.0,
        "eos_token_id": 32000,
        "hidden_act": "silu",
        "hidden_size": 3072,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 131072,
        "model_type": "phi3",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "original_max_position_embeddings": 4096,
        "pad_token_id": 32000,
        "resid_pdrop": 0.0,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "long_factor": [
                1.0800000429153442,
                1.1100000143051147,
                1.1399999856948853,
                1.340000033378601,
                1.5899999141693115,
                1.600000023841858,
                1.6200000047683716,
                2.620000123977661,
                3.2300000190734863,
                3.2300000190734863,
                4.789999961853027,
                7.400000095367432,
                7.700000286102295,
                9.09000015258789,
                12.199999809265137,
                17.670000076293945,
                24.46000099182129,
                28.57000160217285,
                30.420001983642578,
                30.840002059936523,
                32.590003967285156,
                32.93000411987305,
                42.320003509521484,
                44.96000289916992,
                50.340003967285156,
                50.45000457763672,
                57.55000305175781,
                57.93000411987305,
                58.21000289916992,
                60.1400032043457,
                62.61000442504883,
                62.62000274658203,
                62.71000289916992,
                63.1400032043457,
                63.1400032043457,
                63.77000427246094,
                63.93000411987305,
                63.96000289916992,
                63.970001220703125,
                64.02999877929688,
                64.06999969482422,
                64.08000183105469,
                64.12000274658203,
                64.41000366210938,
                64.4800033569336,
                64.51000213623047,
                64.52999877929688,
                64.83999633789062,
            ],
            "short_factor": [
                1.0,
                1.0199999809265137,
                1.0299999713897705,
                1.0299999713897705,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0699999332427979,
                1.0999999046325684,
                1.1099998950958252,
                1.1599998474121094,
                1.1599998474121094,
                1.1699998378753662,
                1.2899998426437378,
                1.339999794960022,
                1.679999828338623,
                1.7899998426437378,
                1.8199998140335083,
                1.8499997854232788,
                1.8799997568130493,
                1.9099997282028198,
                1.9399996995925903,
                1.9899996519088745,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0799996852874756,
                2.0899996757507324,
                2.189999580383301,
                2.2199995517730713,
                2.5899994373321533,
                2.729999542236328,
                2.749999523162842,
                2.8399994373321533,
            ],
            "type": "longrope",
        },
        "rope_theta": 10000.0,
        "sliding_window": 262144,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.43.3",
        "use_cache": True,
        "attention_bias": False,
        "vocab_size": 32064,
    },
    "phi-3_5-vision": {
        "_name_or_path": "Phi-3.5-vision-instruct",
        "architectures": ["Phi3VForCausalLM"],
        "attention_dropout": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi3_v.Phi3VConfig",
            "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM",
        },
        "bos_token_id": 1,
        "embd_layer": {
            "embedding_cls": "image",
            "hd_transform_order": "sub_glb",
            "projection_cls": "mlp",
            "use_hd_transform": True,
            "with_learnable_separator": True,
        },
        "embd_pdrop": 0.0,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 3072,
        "img_processor": {
            "image_dim_out": 1024,
            "model_name": "openai/clip-vit-large-patch14-336",
            "name": "clip_vision_model",
            "num_img_tokens": 144,
        },
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 131072,
        "model_type": "phi3_v",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "original_max_position_embeddings": 4096,
        "pad_token_id": 32000,
        "resid_pdrop": 0.0,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "long_factor": [
                1.0800000429153442,
                1.1100000143051147,
                1.1399999856948853,
                1.340000033378601,
                1.5899999141693115,
                1.600000023841858,
                1.6200000047683716,
                2.620000123977661,
                3.2300000190734863,
                3.2300000190734863,
                4.789999961853027,
                7.400000095367432,
                7.700000286102295,
                9.09000015258789,
                12.199999809265137,
                17.670000076293945,
                24.46000099182129,
                28.57000160217285,
                30.420001983642578,
                30.840002059936523,
                32.590003967285156,
                32.93000411987305,
                42.320003509521484,
                44.96000289916992,
                50.340003967285156,
                50.45000457763672,
                57.55000305175781,
                57.93000411987305,
                58.21000289916992,
                60.1400032043457,
                62.61000442504883,
                62.62000274658203,
                62.71000289916992,
                63.1400032043457,
                63.1400032043457,
                63.77000427246094,
                63.93000411987305,
                63.96000289916992,
                63.970001220703125,
                64.02999877929688,
                64.06999969482422,
                64.08000183105469,
                64.12000274658203,
                64.41000366210938,
                64.4800033569336,
                64.51000213623047,
                64.52999877929688,
                64.83999633789062,
            ],
            "short_factor": [
                1.08,
                1.1,
                1.1300000000000001,
                1.2800000000000002,
                1.3100000000000003,
                1.4500000000000004,
                1.4500000000000004,
                1.9500000000000008,
                2.030000000000001,
                2.4299999999999926,
                2.5699999999999896,
                2.9499999999999815,
                3.729999999999965,
                3.869999999999962,
                4.189999999999955,
                4.43999999999995,
                4.6399999999999455,
                4.979999999999938,
                5.159999999999934,
                5.279999999999932,
                5.759999999999922,
                5.889999999999919,
                5.889999999999919,
                5.969999999999917,
                6.089999999999915,
                6.2799999999999105,
                6.7699999999999,
                6.8899999999998975,
                7.109999999999893,
                7.129999999999892,
                7.179999999999891,
                7.289999999999889,
                7.339999999999888,
                7.559999999999883,
                7.619999999999882,
                7.69999999999988,
                7.879999999999876,
                7.879999999999876,
                7.879999999999876,
                7.939999999999875,
                7.949999999999875,
                7.979999999999874,
                8.19999999999987,
                8.439999999999864,
                8.469999999999864,
                8.589999999999861,
                8.809999999999857,
                8.999999999999853,
            ],
            "type": "su",
        },
        "rope_theta": 10000.0,
        "sliding_window": 262144,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.38.1",
        "use_cache": True,
        "vocab_size": 32064,
        "_attn_implementation": "flash_attention_2",
    },
    "phi-4": {
        "_name_or_path": "Phi-4-mini-instruct",
        "architectures": ["Phi3ForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi3.Phi3Config",
            "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
            "AutoTokenizer": "Xenova/gpt-4o",
        },
        "bos_token_id": 199999,
        "embd_pdrop": 0.0,
        "eos_token_id": 199999,
        "full_attn_mod": 1,
        "hidden_act": "silu",
        "hidden_size": 3072,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "interpolate_factor": 1,
        "lm_head_bias": False,
        "max_position_embeddings": 131072,
        "mlp_bias": False,
        "model_type": "phi3",
        "num_attention_heads": 24,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "original_max_position_embeddings": 4096,
        "pad_token_id": 199999,
        "partial_rotary_factor": 0.75,
        "resid_pdrop": 0.0,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "long_factor": [
                1,
                1.118320672,
                1.250641126,
                1.398617824,
                1.564103225,
                1.74916897,
                1.956131817,
                2.187582649,
                2.446418898,
                2.735880826,
                3.059592084,
                3.421605075,
                3.826451687,
                4.279200023,
                4.785517845,
                5.351743533,
                5.984965424,
                6.693110555,
                7.485043894,
                8.370679318,
                9.36110372,
                10.4687158,
                11.70738129,
                13.09260651,
                14.64173252,
                16.37415215,
                18.31155283,
                20.47818807,
                22.90118105,
                25.61086418,
                28.64115884,
                32.03,
                32.1,
                32.13,
                32.23,
                32.6,
                32.61,
                32.64,
                32.66,
                32.7,
                32.71,
                32.93,
                32.97,
                33.28,
                33.49,
                33.5,
                44.16,
                47.77,
            ],
            "short_factor": [
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
            ],
            "type": "longrope",
        },
        "rope_theta": 10000.0,
        "sliding_window": 262144,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.45.0",
        "use_cache": True,
        "vocab_size": 200064,
    },
    "qwen": {
        "architectures": ["QWenLMHeadModel"],
        "auto_map": {
            "AutoConfig": "configuration_qwen.QWenConfig",
            "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel",
        },
        "attn_dropout_prob": 0.0,
        "bf16": False,
        "emb_dropout_prob": 0.0,
        "hidden_size": 2048,
        "intermediate_size": 11008,
        "initializer_range": 0.02,
        "kv_channels": 128,
        "layer_norm_epsilon": 1e-06,
        "max_position_embeddings": 8192,
        "model_type": "qwen",
        "no_bias": True,
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "rotary_emb_base": 10000,
        "rotary_pct": 1.0,
        "scale_attn_weights": True,
        "seq_length": 8192,
        "tie_word_embeddings": False,
        "tokenizer_class": "QWenTokenizer",
        "transformers_version": "4.32.0",
        "use_cache": True,
        "use_dynamic_ntk": True,
        "use_flash_attn": "auto",
        "use_logn_attn": True,
        "vocab_size": 151936,
    },
    "qwen2": {
        "_name_or_path": "Qwen/Qwen1.5-1.8B-Chat",
        "architectures": ["Qwen2ForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 5504,
        "max_position_embeddings": 4096,
        "max_window_layers": 21,
        "model_type": "qwen2",
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "num_key_value_heads": 16,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 32768,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.37.2",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 151936,
    },
    "qwen2moe": {
        "architectures": ["Qwen2MoeForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 5632,
        "max_position_embeddings": 32768,
        "max_window_layers": 21,
        "model_type": "qwen2_moe",
        "num_attention_heads": 16,
        "num_hidden_layers": 24,
        "num_key_value_heads": 16,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 32768,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.39.0.dev0",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 151936,
        "decoder_sparse_step": 1,
        "moe_intermediate_size": 1408,
        "shared_expert_intermediate_size": 5632,
        "num_experts_per_tok": 4,
        "num_experts": 60,
        "norm_topk_prob": False,
        "output_router_logits": False,
        "router_aux_loss_coef": 0.001,
    },
    "deepseek_v2_lite": {
        "architectures": ["DeepseekV2ForCausalLM"],
        "attention_bias": False,
        "bos_token_id": 100000,
        "eos_token_id": 100001,
        "first_k_dense_replace": 1,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 10944,
        "kv_lora_rank": 512,
        "max_position_embeddings": 163840,
        "model_type": "deepseek_v2",
        "moe_intermediate_size": 1408,
        "moe_layer_freq": 1,
        "n_group": 1,
        "n_routed_experts": 64,
        "n_shared_experts": 2,
        "norm_topk_prob": False,
        "num_attention_heads": 16,
        "num_experts_per_tok": 6,
        "num_hidden_layers": 27,
        "num_key_value_heads": 16,
        "pretraining_tp": 1,
        "qk_nope_head_dim": 128,
        "qk_rope_head_dim": 64,
        "rms_norm_eps": 1e-06,
        "rope_scaling": {
            "beta_fast": 32,
            "beta_slow": 1,
            "factor": 40,
            "mscale": 0.707,
            "mscale_all_dim": 0.707,
            "original_max_position_embeddings": 4096,
            "type": "yarn",
        },
        "rope_theta": 10000,
        "routed_scaling_factor": 1.0,
        "scoring_func": "softmax",
        "topk_group": 1,
        "topk_method": "greedy",
        "torch_dtype": "bfloat16",
        "transformers_version": "4.33.1",
        "use_cache": True,
        "v_head_dim": 128,
        "vocab_size": 102400,
    },
    "stablelm": {
        "architectures": ["StableLmForCausalLM"],
        "bos_token_id": 0,
        "eos_token_id": 0,
        "hidden_act": "silu",
        "hidden_size": 2560,
        "initializer_range": 0.02,
        "intermediate_size": 6912,
        "max_position_embeddings": 4096,
        "model_type": "stablelm",
        "layer_norm_eps": 1e-05,
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "partial_rotary_factor": 0.25,
        "rope_theta": 10000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.38.0",
        "use_cache": True,
        "use_qkv_bias": False,
        "vocab_size": 50304,
    },
    "baichuan": {
        "architectures": ["BaichuanForCausalLM"],
        "tokenizer_class": "BaichuanTokenizer",
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 4096,
        "model_max_length": 4096,
        "model_type": "baichuan",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "pad_token_id": 0,
        "rms_norm_eps": 1e-06,
        "_from_model_config": True,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.29.2",
        "use_cache": True,
        "vocab_size": 125696,
    },
    "internlm": {
        "architectures": ["InternLMForCausalLM"],
        "bias": True,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "internlm",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "pad_token_id": 2,
        "rms_norm_eps": 1e-06,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.33.2",
        "use_cache": True,
        "vocab_size": 103168,
    },
    "gemma2_2b": {
        "architectures": ["Gemma2ForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "attn_logit_softcapping": 50.0,
        "bos_token_id": 2,
        "cache_implementation": "hybrid",
        "eos_token_id": [1, 107],
        "final_logit_softcapping": 30.0,
        "head_dim": 256,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_activation": "gelu_pytorch_tanh",
        "hidden_size": 2304,
        "initializer_range": 0.02,
        "intermediate_size": 9216,
        "max_position_embeddings": 8192,
        "model_type": "gemma2",
        "num_attention_heads": 8,
        "num_hidden_layers": 26,
        "num_key_value_heads": 4,
        "pad_token_id": 0,
        "query_pre_attn_scalar": 256,
        "rms_norm_eps": 1e-06,
        "rope_theta": 10000.0,
        "sliding_window": 4096,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.4",
        "use_cache": True,
        "vocab_size": 256000,
    },
    "gemma2_9b": {
        "architectures": ["Gemma2ForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "attn_logit_softcapping": 50.0,
        "bos_token_id": 2,
        "cache_implementation": "hybrid",
        "eos_token_id": 1,
        "final_logit_softcapping": 30.0,
        "head_dim": 256,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_activation": "gelu_pytorch_tanh",
        "hidden_size": 3584,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 8192,
        "model_type": "gemma2",
        "num_attention_heads": 16,
        "num_hidden_layers": 42,
        "num_key_value_heads": 8,
        "pad_token_id": 0,
        "query_pre_attn_scalar": 256,
        "rms_norm_eps": 1e-06,
        "rope_theta": 10000.0,
        "sliding_window": 4096,
        "sliding_window_size": 4096,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.0.dev0",
        "use_cache": True,
        "vocab_size": 256000,
    },
    "gemma2_27b": {
        "architectures": ["Gemma2ForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "attn_logit_softcapping": 50.0,
        "bos_token_id": 2,
        "cache_implementation": "hybrid",
        "eos_token_id": 1,
        "final_logit_softcapping": 30.0,
        "head_dim": 128,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_activation": "gelu_pytorch_tanh",
        "hidden_size": 4608,
        "initializer_range": 0.02,
        "intermediate_size": 36864,
        "max_position_embeddings": 8192,
        "model_type": "gemma2",
        "num_attention_heads": 32,
        "num_hidden_layers": 46,
        "num_key_value_heads": 16,
        "pad_token_id": 0,
        "query_pre_attn_scalar": 144,
        "rms_norm_eps": 1e-06,
        "rope_theta": 10000.0,
        "sliding_window": 4096,
        "sliding_window_size": 4096,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.0.dev0",
        "use_cache": True,
        "vocab_size": 256000,
        "_attn_implementation": "eager",
    },
    "rwkv5_3b": {
        "architectures": ["RwkvForCausalLM"],
        "auto_map": {
            "AutoConfig": "configuration_rwkv5.Rwkv5Config",
            "AutoModelForCausalLM": "modeling_rwkv5.RwkvForCausalLM",
        },
        "attention_hidden_size": 2560,
        "bos_token_id": 0,
        "context_length": 4096,
        "eos_token_id": 0,
        "head_size": 64,
        "hidden_size": 2560,
        "intermediate_size": None,
        "layer_norm_epsilon": 1e-05,
        "model_type": "rwkv5",
        "model_version": "5_2",
        "num_hidden_layers": 32,
        "rescale_every": 6,
        "tie_word_embeddings": True,
        "transformers_version": "4.34.0",
        "use_cache": True,
        "vocab_size": 65536,
    },
    "orion": {
        "architectures": ["OrionForCausalLM"],
        "auto_map": {
            "AutoConfig": "configuration_orion.OrionConfig",
            "AutoModelForCausalLM": "modeling_orion.OrionForCausalLM",
        },
        "tokenizer_class": "OrionTokenizer",
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 5120,
        "model_type": "orion",
        "initializer_range": 0.02,
        "intermediate_size": 15360,
        "max_position_embeddings": 4096,
        "max_sequence_length": 4096,
        "num_attention_heads": 40,
        "num_hidden_layers": 40,
        "num_key_value_heads": 40,
        "pad_token_id": 0,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.34.0",
        "use_cache": True,
        "vocab_size": 84608,
    },
    "llava": {
        "architectures": ["LlavaForConditionalGeneration"],
        "ignore_index": -100,
        "image_token_index": 32000,
        "model_type": "llava",
        "pad_token_id": 32001,
        "projector_hidden_act": "gelu",
        "text_config": {
            "_name_or_path": "meta-llama/Llama-2-7b-hf",
            "architectures": ["LlamaForCausalLM"],
            "max_position_embeddings": 4096,
            "model_type": "llama",
            "rms_norm_eps": 1e-05,
            "torch_dtype": "float16",
            "vocab_size": 32064,
        },
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.36.0.dev0",
        "vision_config": {
            "hidden_size": 1024,
            "image_size": 336,
            "intermediate_size": 4096,
            "model_type": "clip_vision_model",
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
            "patch_size": 14,
            "projection_dim": 768,
            "vocab_size": 32000,
        },
        "vision_feature_layer": -2,
        "vision_feature_select_strategy": "default",
        "vocab_size": 32064,
    },
    "chatglm": {
        "architectures": ["ChatGLMModel"],
        "model_type": "chatglm",
        "add_bias_linear": False,
        "add_qkv_bias": True,
        "apply_query_key_layer_scaling": True,
        "apply_residual_connection_post_layernorm": False,
        "attention_dropout": 0.0,
        "attention_softmax_in_fp32": True,
        "bias_dropout_fusion": True,
        "ffn_hidden_size": 13696,
        "fp32_residual_connection": False,
        "hidden_dropout": 0.0,
        "hidden_size": 4096,
        "kv_channels": 128,
        "layernorm_epsilon": 1e-05,
        "multi_query_attention": True,
        "multi_query_group_num": 2,
        "num_attention_heads": 32,
        "num_layers": 28,
        "original_rope": True,
        "padded_vocab_size": 65024,
        "post_layer_norm": True,
        "rmsnorm": True,
        "seq_length": 8192,
        "use_cache": True,
        "torch_dtype": "float16",
        "transformers_version": "4.30.2",
        "tie_word_embeddings": False,
        "eos_token_id": 2,
        "pad_token_id": 0,
    },
    "llama3_1_8b": {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 128000,
        "eos_token_id": [128001, 128008, 128009],
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 131072,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "factor": 8.0,
            "low_freq_factor": 1.0,
            "high_freq_factor": 4.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        },
        "rope_theta": 500000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.3",
        "use_cache": True,
        "vocab_size": 128256,
    },
    "llama3_1_70b": {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 128000,
        "eos_token_id": [128001, 128008, 128009],
        "hidden_act": "silu",
        "hidden_size": 8192,
        "initializer_range": 0.02,
        "intermediate_size": 28672,
        "max_position_embeddings": 131072,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 64,
        "num_hidden_layers": 80,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "factor": 8.0,
            "low_freq_factor": 1.0,
            "high_freq_factor": 4.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        },
        "rope_theta": 500000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.3",
        "use_cache": True,
        "vocab_size": 128256,
    },
    "llama3_2_1b": {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 128000,
        "eos_token_id": [128001, 128008, 128009],
        "head_dim": 64,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 131072,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 16,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "factor": 32.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        },
        "rope_theta": 500000.0,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.45.0.dev0",
        "use_cache": True,
        "vocab_size": 128256,
    },
    "llama3_2_3b": {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 128000,
        "eos_token_id": [128001, 128008, 128009],
        "head_dim": 128,
        "hidden_act": "silu",
        "hidden_size": 3072,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 131072,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 24,
        "num_hidden_layers": 28,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "factor": 32.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        },
        "rope_theta": 500000.0,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.45.0.dev0",
        "use_cache": True,
        "vocab_size": 128256,
    },
    # The llama3_8b and llama3_70b presets below are commented out to save CI time.
    # "llama3_8b": {
    #     "architectures": ["LlamaForCausalLM"],
    #     "attention_bias": False,
    #     "attention_dropout": 0.0,
    #     "bos_token_id": 128000,
    #     "eos_token_id": 128001,
    #     "hidden_act": "silu",
    #     "hidden_size": 4096,
    #     "initializer_range": 0.02,
    #     "intermediate_size": 14336,
    #     "max_position_embeddings": 8192,
    #     "model_type": "llama",
    #     "num_attention_heads": 32,
    #     "num_hidden_layers": 32,
    #     "num_key_value_heads": 8,
    #     "pretraining_tp": 1,
    #     "rms_norm_eps": 1e-05,
    #     "rope_scaling": None,
    #     "rope_theta": 500000.0,
    #     "tie_word_embeddings": False,
    #     "torch_dtype": "bfloat16",
    #     "transformers_version": "4.40.0.dev0",
    #     "use_cache": True,
    #     "vocab_size": 128256,
    # },
    # "llama3_70b": {
    #     "architectures": ["LlamaForCausalLM"],
    #     "attention_bias": False,
    #     "attention_dropout": 0.0,
    #     "bos_token_id": 128000,
    #     "eos_token_id": 128001,
    #     "hidden_act": "silu",
    #     "hidden_size": 8192,
    #     "initializer_range": 0.02,
    #     "intermediate_size": 28672,
    #     "max_position_embeddings": 8192,
    #     "model_type": "llama",
    #     "num_attention_heads": 64,
    #     "num_hidden_layers": 80,
    #     "num_key_value_heads": 8,
    #     "pretraining_tp": 1,
    #     "rms_norm_eps": 1e-05,
    #     "rope_scaling": None,
    #     "rope_theta": 500000.0,
    #     "tie_word_embeddings": False,
    #     "torch_dtype": "bfloat16",
    #     "transformers_version": "4.40.0.dev0",
    #     "use_cache": True,
    #     "vocab_size": 128256,
    # },
    "snowflake-arctic-embed-m": {
        "architectures": ["BertModel"],
        "attention_probs_dropout_prob": 0.1,
        "classifier_dropout": None,
        "gradient_checkpointing": False,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "pad_token_id": 0,
        "position_embedding_type": "absolute",
        "torch_dtype": "float32",
        "transformers_version": "4.36.1",
        "type_vocab_size": 2,
        "use_cache": True,
        "vocab_size": 30522,
    },
    # "snowflake-arctic-embed-s": {
    #     "architectures": ["BertModel"],
    #     "attention_probs_dropout_prob": 0.1,
    #     "classifier_dropout": None,
    #     "hidden_act": "gelu",
    #     "hidden_dropout_prob": 0.1,
    #     "hidden_size": 384,
    #     "initializer_range": 0.02,
    #     "intermediate_size": 1536,
    #     "layer_norm_eps": 1e-12,
    #     "max_position_embeddings": 512,
    #     "model_type": "bert",
    #     "num_attention_heads": 12,
    #     "num_hidden_layers": 12,
    #     "pad_token_id": 0,
    #     "position_embedding_type": "absolute",
    #     "torch_dtype": "float32",
    #     "transformers_version": "4.36.1",
    #     "type_vocab_size": 2,
    #     "use_cache": True,
    #     "vocab_size": 30522,
    # },
    "stablelm-2-zephyr-1_6b": {
        "architectures": ["StableLmForCausalLM"],
        "bos_token_id": 100257,
        "eos_token_id": 100257,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 5632,
        "max_position_embeddings": 4096,
        "model_type": "stablelm",
        "layer_norm_eps": 1e-05,
        "num_attention_heads": 32,
        "num_hidden_layers": 24,
        "num_key_value_heads": 32,
        "partial_rotary_factor": 0.25,
        "rope_theta": 10000,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.38.0",
        "use_cache": True,
        "use_qkv_bias": True,
        "vocab_size": 100352,
    },
    "qwen2_0_5b": {
        "architectures": ["Qwen2ForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 896,
        "initializer_range": 0.02,
        "intermediate_size": 4864,
        "max_position_embeddings": 32768,
        "max_window_layers": 24,
        "model_type": "qwen2",
        "num_attention_heads": 14,
        "num_hidden_layers": 24,
        "num_key_value_heads": 2,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 32768,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.40.1",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 151936,
    },
    "qwen2_1_5b": {
        "architectures": ["Qwen2ForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 1536,
        "initializer_range": 0.02,
        "intermediate_size": 8960,
        "max_position_embeddings": 32768,
        "max_window_layers": 28,
        "model_type": "qwen2",
        "num_attention_heads": 12,
        "num_hidden_layers": 28,
        "num_key_value_heads": 2,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 32768,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.40.1",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 151936,
    },
    "qwen2.5_3b": {
        "architectures": ["Qwen2ForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 32768,
        "max_window_layers": 70,
        "model_type": "qwen2",
        "num_attention_heads": 16,
        "num_hidden_layers": 36,
        "num_key_value_heads": 2,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 32768,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.43.1",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 151936,
    },
    "qwen2_7b": {
        "architectures": ["Qwen2ForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 151643,
        "eos_token_id": 151645,
        "hidden_act": "silu",
        "hidden_size": 3584,
        "initializer_range": 0.02,
        "intermediate_size": 18944,
        "max_position_embeddings": 32768,
        "max_window_layers": 28,
        "model_type": "qwen2",
        "num_attention_heads": 28,
        "num_hidden_layers": 28,
        "num_key_value_heads": 4,
        "rms_norm_eps": 1e-06,
        "rope_theta": 1000000.0,
        "sliding_window": 131072,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.41.2",
        "use_cache": True,
        "use_sliding_window": False,
        "vocab_size": 152064,
    },
    "internlm2": {
        "architectures": ["InternLM2ForCausalLM"],
        "attn_implementation": "eager",
        "bias": False,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 32768,
        "model_type": "internlm2",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "pad_token_id": 2,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 1000000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.37.1",
        "use_cache": True,
        "vocab_size": 92544,
    },
    "internlm2_5_7b": {
        "architectures": ["InternLM2ForCausalLM"],
        "attn_implementation": "eager",
        "bias": False,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 32768,
        "model_type": "internlm2",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "pad_token_id": 2,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {"type": "dynamic", "factor": 2.0},
        "rope_theta": 1000000,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.41.0",
        "use_cache": True,
        "vocab_size": 92544,
        "pretraining_tp": 1,
    },
    "starcoder2": {
        "activation_function": "gelu",
        "architectures": ["Starcoder2ForCausalLM"],
        "attention_dropout": 0.1,
        "residual_dropout": 0.1,
        "embedding_dropout": 0.1,
        "attention_softmax_in_fp32": True,
        "bos_token_id": 0,
        "eos_token_id": 0,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_size": 4608,
        "initializer_range": 0.018042,
        "intermediate_size": 18432,
        "layer_norm_epsilon": 1e-05,
        "max_position_embeddings": 16384,
        "mlp_type": "default",
        "model_type": "starcoder2",
        "norm_epsilon": 1e-05,
        "norm_type": "layer_norm",
        "num_attention_heads": 36,
        "num_hidden_layers": 32,
        "num_key_value_heads": 4,
        "rope_theta": 1000000,
        "scale_attention_softmax_in_fp32": True,
        "scale_attn_weights": True,
        "sliding_window": 4096,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.37.0.dev0",
        "use_bias": True,
        "use_cache": True,
        "vocab_size": 49152,
    },
    "smollm_1_7b": {
        "_name_or_path": "HuggingFaceTB/cosmo2-1.7B-webinst-sc2",
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 2048,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 32,
        "num_hidden_layers": 24,
        "num_key_value_heads": 32,
        "pad_token_id": 2,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.3",
        "use_cache": True,
        "vocab_size": 49152,
    },
    "smollm_360m": {
        "_name_or_path": "HuggingFaceTB/cosmo2-350M-webinst-sc2",
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 960,
        "initializer_range": 0.02,
        "intermediate_size": 2560,
        "max_position_embeddings": 2048,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 15,
        "num_hidden_layers": 32,
        "num_key_value_heads": 5,
        "pad_token_id": 2,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.3",
        "use_cache": True,
        "vocab_size": 49152,
    },
    "smollm_135m": {
        "_name_or_path": "HuggingFaceTB/cosmo2-135M-webinst-sc2",
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 576,
        "initializer_range": 0.02,
        "intermediate_size": 1536,
        "max_position_embeddings": 2048,
        "mlp_bias": False,
        "model_type": "llama",
        "num_attention_heads": 9,
        "num_hidden_layers": 30,
        "num_key_value_heads": 3,
        "pad_token_id": 2,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": True,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.42.3",
        "use_cache": True,
        "vocab_size": 49152,
    },
    "aya-23": {
        "architectures": ["CohereForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 5,
        "eos_token_id": 255001,
        "hidden_act": "silu",
        "hidden_size": 4096,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "layer_norm_eps": 1e-05,
        "logit_scale": 0.0625,
        "max_position_embeddings": 8192,
        "model_type": "cohere",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 8,
        "pad_token_id": 0,
        "rope_theta": 10000,
        "torch_dtype": "float16",
        "transformers_version": "4.40.0.dev0",
        "use_cache": True,
        "use_qk_norm": False,
        "vocab_size": 256000,
    },
    "minicpm_2b": {
        "architectures": ["MiniCPMForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2304,
        "initializer_range": 0.1,
        "intermediate_size": 5760,
        "max_position_embeddings": 65536,
        "max_length": 131072,
        "model_type": "minicpm",
        "num_attention_heads": 36,
        "num_hidden_layers": 40,
        "num_key_value_heads": 36,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {"type": "dynamic", "factor": 4.0},
        "torch_dtype": "bfloat16",
        "transformers_version": "4.36.0",
        "use_cache": True,
        "vocab_size": 122760,
        "scale_emb": 12,
        "dim_model_base": 256,
        "scale_depth": 1.4,
        "tie_word_embeddings": False,
        "rope_theta": 1000000.0,
    },
    "minicpm_2b_sft_bf16": {
        "architectures": ["MiniCPMForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2304,
        "initializer_range": 0.1,
        "intermediate_size": 5760,
        "max_position_embeddings": 4096,
        "model_type": "minicpm",
        "num_attention_heads": 36,
        "num_hidden_layers": 40,
        "num_key_value_heads": 36,
        "rms_norm_eps": 1e-05,
        "torch_dtype": "bfloat16",
        "tie_word_embeddings": True,
        "transformers_version": "4.36.0",
        "use_cache": True,
        "vocab_size": 122753,
        "scale_emb": 12,
        "dim_model_base": 256,
        "scale_depth": 1.4,
    },
    "minicpm-moe-8x2b": {
        "architectures": ["MiniCPMForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 2304,
        "initializer_range": 0.1,
        "intermediate_size": 5760,
        "max_position_embeddings": 4096,
        "model_type": "minicpm",
        "num_attention_heads": 36,
        "num_hidden_layers": 40,
        "num_key_value_heads": 36,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "torch_dtype": "bfloat16",
        "tie_word_embeddings": True,
        "transformers_version": "4.36.0",
        "use_cache": True,
        "vocab_size": 122753,
        "scale_emb": 12,
        "dim_model_base": 256,
        "scale_depth": 1.4,
        "num_experts": 8,
        "num_experts_per_tok": 2,
    },
    "deepseek": {
        "architectures": ["DeepseekForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "bos_token_id": 100000,
        "eos_token_id": 100001,
        "first_k_dense_replace": 1,
        "hidden_act": "silu",
        "hidden_size": 2048,
        "initializer_range": 0.02,
        "intermediate_size": 10944,
        "max_position_embeddings": 4096,
        "model_type": "deepseek",
        "moe_intermediate_size": 1408,
        "moe_layer_freq": 1,
        "n_routed_experts": 64,
        "n_shared_experts": 2,
        "norm_topk_prob": False,
        "num_attention_heads": 16,
        "num_experts_per_tok": 6,
        "num_hidden_layers": 28,
        "num_key_value_heads": 16,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-06,
        "rope_scaling": None,
        "rope_theta": 10000,
        "scoring_func": "softmax",
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.36.2",
        "use_cache": True,
        "vocab_size": 102400,
    },
    "gpt_j": {
        "activation_function": "gelu_new",
        "architectures": ["GPTJForCausalLM"],
        "attn_pdrop": 0.0,
        "bos_token_id": 50256,
        "embd_pdrop": 0.0,
        "eos_token_id": 50256,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "gptj",
        "n_embd": 4096,
        "n_head": 16,
        "n_inner": None,
        "n_layer": 28,
        "n_positions": 2048,
        "resid_pdrop": 0.0,
        "rotary": True,
        "rotary_dim": 64,
        "scale_attn_weights": True,
        "summary_activation": None,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
        "rope_scaling": {"rope_type": "gptj"},
        "tie_word_embeddings": False,
        "tokenizer_class": "GPT2Tokenizer",
        "transformers_version": "4.18.0.dev0",
        "use_cache": True,
        "vocab_size": 50400,
    },
}
